# Importation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
from models import *
# Merge of the two datasets (movies + credits), joined on the movie id
tmdb = pd.read_csv("../Data/TMDB/tmdb_5000_movies.csv")
tmdb2 = pd.read_csv("../Data/TMDB/tmdb_5000_credits.csv")
# The credits file calls the key 'movie_id'; align it with the movies file
tmdb2.rename(columns={'movie_id':'id'}, inplace=True)
data = pd.merge(tmdb,tmdb2, on='id')
# Print the columns
print(data.columns)
### We need to be careful of the missing values
# NOTE: bare expression — the per-column NaN counts are only displayed in a notebook
data.isnull().sum()
def drop_useless_data(ds) :
    """Remove rows that are unusable for modelling.

    Drops movies with fewer than 10 votes, a missing overview, or a zero
    runtime / budget / revenue, then resets the index.  The old index is
    deliberately kept as an 'index' column (it is removed later on).
    """
    keep = ~(ds.vote_count < 10)
    keep &= ds.overview.notnull()
    keep &= ~(ds.runtime == 0)
    keep &= ~(ds.budget == 0)
    keep &= ~(ds.revenue == 0)
    return ds[keep].reset_index()
# Filter out rows with missing / degenerate values (see drop_useless_data)
data = drop_useless_data(data)
# It seems that homepage and tagline have missing values
# Nevertheless, homepage is a string of the summary of the movie. We will take care of it only if we have time.
# Tagline gives a kind of slogan of the movie. We don't take care of it now
# Then we convert the release time
# Conversion of the time
def convert_date_time(ds,name) :
    """Return a copy of *ds* where the string column *name* is parsed as datetimes."""
    converted = ds.copy()
    converted[name] = pd.to_datetime(converted[name])
    return converted
# Parse release_date strings into real datetimes
data = convert_date_time(data,'release_date')
### Then we drop all the movies released before 1985
### They seem obsolete for our criteria
def drop_time(ds) :
new_ds = ds.copy()
time = pd.to_datetime('1985-01-01')
indexes = new_ds.release_date[new_ds.release_date < time].index
new_ds = new_ds.drop(indexes)
new_ds = new_ds.reset_index()
return new_ds
# Keep only movies released from 1985 onwards
data = drop_time(data)
### Create a feature for the year
### 1-of-K encoding for the months
def encode_year(ds) :
    """Return a copy of *ds* with an integer 'year' column taken from release_date."""
    with_year = ds.copy()
    with_year['year'] = with_year.release_date.dt.year
    return with_year
def encode_month(ds) :
    """Return a copy of *ds* with a 'month' column plus month_1..month_12 dummies.

    Bug fix: pd.get_dummies on the integer month column produces columns keyed
    by the *int* month, so the rename mapping must use int keys.  The old code
    used float(i) keys and only worked through int/float hash equality.
    """
    X = ds.copy()
    X['month'] = X.release_date.apply(lambda x : x.month)
    dummies = pd.get_dummies(X['month'])
    dummies.rename(columns={i: 'month_' + str(i) for i in range(1, 13)}, inplace=True)
    return pd.concat([X, dummies], axis=1)
# Add the year feature and the 1-of-K month columns
data = encode_year(data)
data =encode_month(data)
### Vizualisation of the data in order to convert it properly
import json
def get_json(ds,name) :
    """Count how often each "name" entry appears in the JSON column *name*.

    Returns (labels, counts): labels in first-appearance order, counts aligned.
    """
    occurrences = {}
    for raw in ds[name] :
        for entry in json.loads(raw) :
            label = entry.get("name")
            occurrences[label] = occurrences.get(label, 0) + 1
    return list(occurrences.keys()), list(occurrences.values())
def plotJson(datas,name,sizeX) :
    """Bar-plot the occurrence count of each label of the JSON column *name*,
    sorted by decreasing count (capped at the 25 most frequent labels)."""
    # NOTE(review): the sizeX parameter is never used in this function
    genres,count = get_json(datas,name)
    index_sort = np.argsort(count)
    # Reverse the ascending sort to get the most frequent labels first
    genres = np.flip(np.array(genres)[index_sort],axis=0)
    count = np.flip(np.array(count)[index_sort],axis=0)
    y_axis = count
    x_axis = [i for i in range(len(count))]
    x_label = genres
    plt.xticks(rotation=90, fontsize = 15)
    plt.ylabel("Number of occurences")
    if ( len(x_axis) > 25 ) :
        # It doesn't look great if the length is > 25, so cap the plot
        plt.bar(x_axis[:25], y_axis[:25], align = 'center', color='b')
        plt.xticks(x_axis[:25], x_label[:25])
    else :
        plt.bar(x_axis, y_axis, align = 'center', color='b')
        plt.xticks(x_axis, x_label)
    plt.xlabel(name)
    plt.title(name + " popularity")
    plt.show()
# Visual check of the two JSON-encoded columns
plotJson(data,'genres',15)
plotJson(data,'spoken_languages',0.05)
### Dealing with the occurences for genres
### The aim here is to keep the N most relevant values and to make the other as a "garbage" value
def group_by_occurences(ds,name,n) :
    """Return the n-1 most frequent labels of the JSON column *name*, plus a
    catch-all '<name>_others' label, as an array of future column names."""
    labels, counts = get_json(ds, name)
    # Descending order of occurrence count
    order = np.argsort(counts)[::-1]
    top_labels = np.array(labels)[order][:n-1]
    return np.append(top_labels, name + "_others")
def return_indexes_json(ds,name,category) :
    """Return the positional indexes of rows whose JSON column *name*
    contains an entry with "name" equal to *category*.

    An index is appended once per matching entry, like the original code.
    """
    hits = []
    for pos, raw in enumerate(ds[name]) :
        for entry in json.loads(raw) :
            if entry.get("name") == category :
                hits.append(pos)
    return hits
def create_columns_by_occurences(ds,name,n) :
    """1-of-K encode the JSON column *name* into its n-1 most frequent labels
    plus a '<name>_others' fallback column, then drop the original column.

    NOTE(review): the loop below rebinds the parameter n via (n,p) = X.shape;
    harmless here because n is no longer needed, but fragile.
    NOTE(review): X[x][indexes] = 1 is chained assignment — it relies on the
    column access returning a view, which is not guaranteed in modern pandas
    (copy-on-write); the warning is silenced by filterwarnings at the top.
    """
    X = ds.copy()
    column_usable = group_by_occurences(ds,name,n)
    new_vals = np.zeros( (len(column_usable),), dtype=int )
    # NOTE(review): new_cols is computed but never used
    new_cols = np.append(X.columns.tolist(), column_usable)
    for x in column_usable :
        (n,p) = X.shape
        initial = [0 for i in range(n)]
        X.insert(p,x,initial,True)
    X[column_usable] = new_vals
    indexes_fullfill = []
    for i in range(len(column_usable)-1) :
        # Set 1 on the rows where this label appears
        x = column_usable[i]
        indexes = return_indexes_json(ds,name,x)
        indexes_fullfill = indexes_fullfill + indexes
        X[x][indexes] = 1
    # The "others" column is 1 exactly where no named label matched
    # NOTE(review): col is assigned but never used
    col = X[column_usable[len(column_usable)-1]]
    X[column_usable[len(column_usable)-1]] = 1
    X[column_usable[len(column_usable)-1]][indexes_fullfill] = 0
    X =X.drop(name,axis=1)
    return X
# 1-of-K encoding for the genres. 50 is a value superior to take into account all the values of genre
data = create_columns_by_occurences(data,"genres",50)
#data = create_columns_by_occurences(data,"spoken_languages",5)
# We first added spoken_languages by keeping the 5 most occured languages
# Then we realised that this feature isn't so relevant (few words in a given movie will be taken into account
# in spoken_languages). Indeed, it doesn't seem important
# We need to take into account the fact that a vote with a lot of vote_count is more accurate
# Thus, we need to find a way to create a new column to take this into account
# By using the Chebyshev's inequality, we can get that the probability of the expected vote of a film to be greater than
# vote_average - 10/sqrt(vote_count) is 0,99 (if we consider that the variance of vote is equal to 1)
# Confidence-penalised rating: few votes -> bigger penalty
data['grade'] = data.vote_average - 10/np.sqrt(data.vote_count)
# Scatter plot of revenue against budget
plt.plot(data['budget'], data['revenue'], '+',color='r')
plt.ylabel('Revenue')
plt.xlabel('Budget')
plt.title('Ratio graph')
plt.show()
# Plots grade and vote_average depending on budget
plt.plot(data.budget, data.vote_average,color='r', marker='+',linestyle='None',label='Vote average')
plt.plot(data.budget, data.grade,color='b', marker='+',linestyle='None',label='Grade')
plt.xlabel('budget')
plt.legend()
plt.title("Differences between vote average and grade depending on the budget")
plt.show()
### Create a new column ratio to take into account both budget and revenue
data['ratio'] = data.budget / data.revenue
# NOTE: bare expression — only displayed in a notebook
data.ratio[:5]
### We sort rows by grade and we drop vote_average, vote_count and popularity because they are no longer relevant
# Bug fix: sort_values returns a new DataFrame; the result was previously
# discarded, so the sort had no effect — assign it back to data.
data = data.sort_values(by=['grade'], ascending=False)
data = data.drop(['vote_average','vote_count','popularity'],axis = 1)
# We use data about oscar nominees to give an "oscar_score" to each film,
# depending on the number of people in the cast that have already been nominee for an oscar.
oscars = pd.read_csv("../Data/Oscar/the_oscar_award.csv")
oscars = oscars[oscars['year_film'] > 1970]
# Number of nominations per person, most-nominated first
oscarCount = oscars.groupby("name").agg("count").reset_index()[['name', 'film']].rename(columns={'film': 'count'}).sort_values(by = 'count', ascending = False)
# One big lowercase string of all nominee names, used for substring matching
# NOTE(review): substring matching against a single joined string can produce
# false positives when one name is contained in another — confirm acceptable
oscar_name_string = oscarCount.name.to_string(index=False).lower()
for index, row in data.iterrows():
    oscar_nominee = 0
    oscar_score = 0
    # NOTE(review): nominations is never used
    nominations=[]
    dic = json.loads(row.cast)
    for ele in dic:
        nom = ele.get('name').lower()
        if (oscar_name_string.find(nom) > -1):
            oscar_nominee += 1
    if (oscar_nominee > 0):
        # Score = 100 * sum_{s=1..k} (1/2)^s : saturates towards 100 as the
        # number of previously-nominated cast members grows
        for s in range(1,oscar_nominee+1):
            oscar_score = oscar_score + (1/2)**s
        oscar_score = oscar_score*100
    data.loc[index, 'oscar_score'] = oscar_score
def sort_by_column(ds,columns) :
    """Return a new DataFrame sorted in ascending order by *columns*.

    sort_values already returns a new object, so no extra copy is made.
    """
    return ds.sort_values(by=columns)
# We need to take into account the time when the movies were released
data = sort_by_column(data, ['year', 'month'])
### First of all, we will convert the string into a dictionary
### Then, we count the occurences in order to know what is most used
### After that, we will only keep the first N ones which are most used
### Convert the column into a dictionary and count the occurences
def get_json(ds, name):
    """Count occurrences of each "name" entry in the JSON column *name*.

    Duplicate of the earlier helper (a notebook artifact): this later
    definition shadows the first one.  Returns (labels, counts) with labels
    in first-appearance order.
    """
    tally = {}
    for raw in ds[name]:
        for entry in json.loads(raw):
            key = entry.get("name")
            tally[key] = tally.get(key, 0) + 1
    return list(tally), list(tally.values())
def sup_to_value(liste1,liste2,value) :
    """Filter the paired lists, keeping the pairs whose second element is >= value."""
    pairs = [(a, b) for a, b in zip(liste1, liste2) if b >= value]
    kept1 = [a for a, _ in pairs]
    kept2 = [b for _, b in pairs]
    return kept1, kept2
def plotJsonLimited(ds,name,sizeX,value_min) :
    """Bar-plot the labels of JSON column *name* whose occurrence count
    is at least *value_min*."""
    # NOTE(review): the sizeX parameter is never used
    genres,count = get_json(ds,name)
    x_axis,y_axis = sup_to_value(genres,count, value_min)
    plt.xticks(rotation=90, fontsize = 15)
    plt.ylabel("Number of occurences")
    plt.xlabel(name)
    plt.bar(x_axis, y_axis, align = 'center', color='b')
    plt.title(name + " popularity")
    plt.show()
def get_N_first(ds,name,N) :
    """Return the most frequent labels of JSON column *name* with their counts,
    sorted by decreasing count, keeping those whose count reaches the
    (N+1)-th largest value.

    NOTE(review): raises IndexError when N >= the number of distinct labels,
    and ties with the threshold can make the result contain more than N items.
    """
    genres,count = get_json(ds,name)
    # Threshold = (N+1)-th largest count (0-based index N of the descending sort)
    value_min = np.flip(np.sort(count),axis=0)[N]
    types, count = sup_to_value(genres,count, value_min)
    count_indexes = np.argsort(count)
    types = np.flip(np.array(types)[count_indexes],axis=0)
    count = np.flip(np.array(count)[count_indexes],axis=0)
    return types,count
def get_N_first_all(ds,name) :
    """Return every label of JSON column *name* with its occurrence count,
    both sorted by decreasing count."""
    labels, counts = get_json(ds, name)
    # The >= min(count) filter of the original is a no-op but is kept so
    # behavior (including the IndexError on an empty count list) is identical
    value_min = np.flip(np.sort(counts), axis=0)[-1]
    labels, counts = sup_to_value(labels, counts, value_min)
    order = np.argsort(counts)[::-1]
    return np.array(labels)[order], np.array(counts)[order]
def plot_N_first(ds,name,N) :
    """Bar-plot the N most frequent labels of JSON column *name*.

    Returns ALL labels and counts (not only the N plotted ones).
    """
    types,count = get_N_first_all(ds,name)
    plt.xticks(rotation=90, fontsize = 15)
    plt.ylabel("Number of occurences")
    plt.xlabel(name)
    plt.bar(types[:N],count[:N], align = 'center', color='b')
    plt.title(name + " popularity")
    #plt.show()
    return types,count
#plot_N_first(data, "production_countries",20)
# Keep the company labels/counts as module-level globals (used by split_name_prod_comp)
types_p_companies, count_t_companies = plot_N_first(data, "production_companies",20)
#types_cast, count_cast = plot_N_first(data, "cast",20)
#types_crew, count_crew = plot_N_first(data, "crew",20)
### Here we do the 1-of-K encoding for the 5 most important production_countries
data = create_columns_by_occurences(data,"production_countries",5)
### We want to compute 1-of-K encoding for the production_companies
### Nevertheless, we will first categorize it in 3 categories
### To do that, we take into account the previous plot and we split in 3
### That is, when the occurences are > 85, between 85 and 45, or <= 45
def get_index_json(x,liste_companies) :
    """Map a JSON company string to a tier index.

    *liste_companies* is a list of name collections ordered from low to high
    tier.  Returns the highest tier index whose collection contains one of the
    entry names, or 0 when nothing matches.

    Bug fixes: removed a leftover debug print of every row, and renamed the
    inner loop variable that shadowed the *x* parameter.
    """
    tiers = []
    for entry in json.loads(x) :
        entry_name = entry.get("name")
        for i, names in enumerate(liste_companies) :
            if entry_name in names :
                tiers.append(i)
                break
    if not tiers :
        return 0
    return np.amax(tiers)
def split_name_prod_comp(ds) :
    """Bucket production companies into low/medium/high tiers by occurrence
    count, then 1-of-K encode the tier.

    NOTE(review): relies on the notebook globals types_p_companies /
    count_t_companies and on encode_language (defined later in the file);
    contains a debug print; the call below is currently commented out.
    """
    high_comp_indexes = np.where(count_t_companies > 85)[0]
    high_comp = types_p_companies[high_comp_indexes]
    medium_comp_indexes = np.where( (count_t_companies <= 85) & (count_t_companies > 45) )[0]
    medium_comp = types_p_companies[medium_comp_indexes]
    low_comp_indexes = np.where(count_t_companies <= 45)[0]
    low_comp = types_p_companies[low_comp_indexes]
    # Ordered low -> high so get_index_json can return the max tier index
    all_companies = [low_comp,medium_comp,high_comp]
    print(ds.production_companies)
    ds.production_companies = ds.production_companies.apply(get_index_json, liste_companies = all_companies)
    ds = encode_language(ds.copy(),"production_companies",'prod_company')
    return ds
#data = split_name_prod_comp(data)
#print(data.head(2))
# Count movies per original language and plot the distribution
data_lang = data.groupby(['original_language']).size().reset_index(name='Occurences')
data_lang.sort_values(by='Occurences',ascending=False).plot(kind='bar',\
color='b', label='Revenue', grid=True, linestyle='-' )
plt.ylabel("Occurences")
plt.xlabel("Original languages")
plt.title('Occurences of original languages')
def update_original_language(ds) :
    """Keep the five most frequent original languages; replace every other
    value (including NaN) by 'others'.  Mutates and returns *ds*.
    """
    kept = ['en', 'fr', 'es', 'de', 'zh']
    ds.original_language = ds.original_language.where(ds.original_language.isin(kept), 'others')
    return ds
def encode_language(ds,name,prefix_name) :
    """Append 1-of-K dummy columns for column *name*, prefixed with *prefix_name*."""
    dummies = pd.get_dummies(ds[name], prefix=prefix_name)
    return pd.concat([ds, dummies], axis=1)
# Group rare languages under 'others', then 1-of-K encode the result
data = update_original_language(data)
data = encode_language(data,'original_language','language')
### We will encode the runtime into different categories
def encode_runtime(ds,name) :
    """Bucket the runtime column *name* into 4 ordinal categories.

    1: < 60 min, 2: [60, 90), 3: [90, 120), 4: >= 120 min.
    NaN runtimes are left untouched.  Mutates and returns *ds*.

    Bug fix: the original chained `ds[name].loc[...] = ...` assignments rely
    on chained assignment writing through a view, which is unreliable in
    modern pandas (copy-on-write); the categories are now computed from a
    snapshot of the column and assigned back in one step.  The unused local
    strings and dead commented-out print loop were removed.
    """
    runtime = ds[name]
    ds[name] = (runtime.mask(runtime < 60, 1)
                       .mask((runtime >= 60) & (runtime < 90), 2)
                       .mask((runtime >= 90) & (runtime < 120), 3)
                       .mask(runtime >= 120, 4))
    return ds
# Replace raw runtimes by their 4-level ordinal category
data = encode_runtime(data,"runtime")
### 1-of-K encoding for the runtime
def encoding(ds,name) :
    """Append 1-of-K columns for the 4 runtime categories of column *name*,
    renamed to readable duration labels."""
    labels = {1.0 : 'short_time', 2.0 : 'medium_time',
              3.0 : 'quite_long_time', 4.0 : 'long_time'}
    dummies = pd.get_dummies(ds[name]).rename(columns=labels)
    return pd.concat([ds, dummies], axis=1)
# Append the 1-of-K runtime-category columns
data = encoding(data,"runtime")
def remove_unused_data(ds,liste_columns) :
    """Return *ds* without the given feature columns.

    Raises KeyError if one of the columns is absent, like the original
    one-by-one drop loop.
    """
    return ds.drop(list(liste_columns), axis=1)
# Columns that are either raw text, redundant, or leftovers of the two
# reset_index calls ('level_0', 'index')
unused = ['homepage','original_title','overview','runtime','status','tagline','title_x' ,\
'title_y','original_language','production_companies',\
'level_0','index','id','release_date','spoken_languages','month']
data = remove_unused_data(data,unused)
def get_data_from_db_by_gender(ds) :
    """Split *ds* (assumed already ordered by year) into one DataFrame per genre.

    The genre indicator columns are the contiguous block from 'Drama' to
    'genres_others'.  Returns (list_of_frames, list_of_genre_names); each
    per-genre frame has the genre indicator columns removed.

    Bug fix: `drop(liste_genre, 1)` used the positional axis argument, which
    was removed in pandas 2.0 — the keyword form is used instead.
    """
    db = ds.copy()
    i_first = np.where(db.columns == "Drama")[0][0]
    i_last = np.where(db.columns == 'genres_others')[0][0]
    liste_genre = [ db.columns[i] for i in range(i_first, i_last + 1)]
    data_genres = []
    for name in liste_genre :
        current_db = db[db[name] == 1].drop(liste_genre, axis=1)
        data_genres.append(current_db)
    return data_genres,liste_genre
def get_data_from_db_by_gender_n(ds,n) :
    """Return only the first *n* per-genre DataFrames and their genre names."""
    frames, genre_names = get_data_from_db_by_gender(ds)
    # Keep the original trace of which genres are retained
    print(genre_names[:n])
    return frames[:n], genre_names[:n]
# Creation of our database for each genre
# We only use it now for the vizualisation
data_genres,liste_genre = get_data_from_db_by_gender(data)
def plot_gender(data_genres,liste_genre) :
    """Pie-chart the proportion of movies in the first 15 genres.

    Only 15 are plotted because more wedges become unreadable.
    """
    sizes = []
    for x in data_genres[:15] :
        current_size = x.shape[0]
        sizes.append(current_size)
    fig = plt.figure(figsize=(9,9))
    plt.pie(sizes, labels=liste_genre[:15],autopct='%1.1f%%',shadow=True)
    plt.title("Repartition of genre on our dataset")
    plt.show()
# Vizualisation of the proportion of the genre
plot_gender(data_genres,liste_genre)
# Keywords : gets the average note of a given keyword
# Then : we make the average of each keyword for a given movie
# As we have a grade for each keyword, we will take the average for a given movie of all keywords
### Convert the column into a dictionary and get all the popularity_rate for each
def get_popularity_by_name(ds,name) :
    """For each label of JSON column *name*, gather the 'grade' of every movie
    it appears in and return (labels, mean_grade_per_label).

    Labels come out in first-appearance order, like the original.
    """
    grades_by_label = {}
    all_grades = np.array(ds["grade"])
    for pos, raw in enumerate(ds[name]) :
        grade = all_grades[pos]
        for entry in json.loads(raw) :
            grades_by_label.setdefault(entry.get("name"), []).append(grade)
    labels = list(grades_by_label)
    means = [np.mean(grades) for grades in grades_by_label.values()]
    return labels, means
def sort_two_lists(liste1,liste2) :
    """Sort both lists by descending liste2, keeping the pairing aligned.

    Returns two numpy arrays.
    """
    order = np.argsort(liste2)[::-1]
    return np.array(liste1)[order], np.array(liste2)[order]
def plot_bar(x,y,x_name,y_name,title) :
    """Draw a simple labelled bar chart of y against x."""
    plt.xticks(rotation=90, fontsize = 15)
    plt.ylabel(y_name)
    plt.xlabel(x_name)
    plt.bar(x,y, align = 'center', color='b')
    plt.title(title)
    plt.show()
def plot_popularity_keyword(N) :
    """Bar-plot the N keywords with the highest mean grade.

    Reads the module-level `data` DataFrame.
    """
    keyword,popularity = get_popularity_by_name(data,"keywords")
    keyword,popularity = sort_two_lists(keyword,popularity)
    keyword ,popularity = keyword[:N],popularity[:N]
    plot_bar(keyword,popularity,"Keywords","Popularity","Popularity of keywords")
def convert_keyword_grade(x,K,P) :
    """Average the precomputed grades of every keyword of instance *x*.

    K is the array of keyword names, P the aligned per-keyword grades and x
    the raw JSON string.  Returns NaN when x has no entries (mean of []).
    """
    grades = [P[np.where(K == entry.get("name"))[0][0]]
              for entry in json.loads(x)]
    return np.mean(grades)
def process_by_grade(ds,name) :
    """Replace the JSON column *name* by a numeric '<name>_grade' column equal
    to the mean grade of the labels appearing in each row."""
    X = ds.copy()
    labels, grades = sort_two_lists(*get_popularity_by_name(X, name))
    X[name + '_grade'] = X[name].apply(convert_keyword_grade, K=labels, P=grades)
    return X.drop(name, axis=1)
def process_grade_genre(liste_genres) :
    """For each per-genre DataFrame, split into train/test (80/20, no shuffle)
    and replace keywords/cast/crew by their grade columns.

    Returns (list_of_train_frames, list_of_test_frames).
    NOTE(review): train_test_split is imported further down the file; this
    works only because the function is defined before being called, and this
    helper appears to be unused (superseded by the Preprocess class).
    """
    liste_to_process = ['keywords','cast','crew']
    liste_train, liste_test = [],[]
    for df in liste_genres :
        train, test = train_test_split(df, test_size=0.2, shuffle = False)
        for x in liste_to_process :
            train = process_by_grade(train,x)
            test = process_by_grade(test,x)
        liste_train.append(train)
        liste_test.append(test)
    return liste_train, liste_test
### We need to drop the instances where the keywords_grade is null
def drop_nan_grade(ds) :
    """Return *ds* without the rows whose keywords_grade is NaN, with a fresh
    index (the old one is discarded)."""
    cleaned = ds[ds.keywords_grade.notnull()]
    return cleaned.reset_index(drop=True)
def get_clean_value(train, test) :
    """Drop the NaN-grade rows from every train frame and its matching test frame."""
    cleaned_train, cleaned_test = [], []
    for i, df in enumerate(train) :
        cleaned_train.append(drop_nan_grade(df))
        cleaned_test.append(drop_nan_grade(test[i]))
    return cleaned_train, cleaned_test
from sklearn.model_selection import train_test_split
def split_data(df,df1, label) :
    """Split the train (*df*) and test (*df1*) frames into X/y numpy arrays
    for the given *label* column.

    Bug fix: `drop(label, 1)` used the positional axis argument, which was
    removed in pandas 2.0 — pass axis=1 explicitly.
    """
    X_train = df.drop(label, axis=1).values
    y_train = df[label].values
    X_test = df1.drop(label, axis=1).values
    y_test = df1[label].values
    return X_train, X_test, y_train, y_test
# Class that will allow us to facilitate the use of parameters
class SKlearnHelper(object):
    """Thin wrapper around an estimator class, instantiating it from a
    parameter dict so different models can be driven uniformly.

    Bug fix: `params=None` previously crashed on `clf(**params)`; an empty
    dict is now substituted when no parameters are given.  `seed` is kept
    for backward compatibility (the random_state line was already disabled).
    """
    def __init__(self, clf, seed=0, params=None):
        #params['random_state'] = seed
        self.clf = clf(**(params or {}))
    def train(self, x_train, y_train):
        # Trains the classifier
        self.clf.fit(x_train, y_train)
    def predict(self, x):
        # Predicts the output given a dataset
        return self.clf.predict(x)
    def fit(self,x,y):
        # Fits the given data and the output
        return self.clf.fit(x,y)
Here we define the coefficient of determination, which we call the Score for a regressor:
$$ R^{2} = 1 - \frac{ \sum_{i} (y_{i} - \hat y_{i})^{2} }{\sum_{i} (y_{i} - \bar y)^{2}}$$ where $y_{i}$ is the true output, $\hat y_{i}$ is the predicted one and $\bar y$ is the mean of the true outputs
# Validation step : we do this only to get the best parameters from our models
# Then, we'll keep the best classifier / regressor for testing it with the test set
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
# Number of folds for the validation step
NFOLDS = 5
# We use a time-aware split: our data is time-dependent, so each fold only
# validates on samples that come after its training samples
tscv = TimeSeriesSplit(n_splits=NFOLDS)
def get_oof(clf, x_train, y_train) :
    """Out-of-fold validation: average R^2 over the TimeSeriesSplit folds.

    Only meant for regressors (classifiers use another criterion).
    Reads the module-level NFOLDS and tscv.
    """
    oof_score = np.zeros( (NFOLDS))
    for i,(train_index, test_index) in enumerate(tscv.split(x_train)) :
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        y_te = y_train[test_index]
        clf.fit(x_tr,y_tr)
        oof_y_predict = clf.predict(x_te)
        # Coefficient of determination defined above
        oof_score[i] = r2_score(y_te,oof_y_predict)
    return np.mean(oof_score)
def get_score(model, x_train, y_train, x_test, y_test) :
    """Fit *model* on the full training set and return its R^2 on the test set."""
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    return r2_score(y_test, predictions)
# NOTE: bare expression — column listing only displayed in a notebook
data.columns
# Compute the grade for keywords, crew and cast for all the data
# As we don't want to cheat, we can't compute the grade for all the data and make it in both the train and
# the test.
# We compute here the grade for each of these categories for all the data
# Then, we'll compute it for only the train set
# For the test set, it will have access to the all data. So we'll keep all the dataset to give the grade
liste_all_grade = []
for x in ['keywords','cast','crew'] :
    liste_all_grade.append(process_by_grade(data,x))
# Class to get the split of our data depending on genre, removing some features, etc
# It will focus on the computation of the keyword_grade, cast_grade and crew_grade
from sklearn.model_selection import train_test_split
import copy
class Preprocess:
    """Splits the dataset (optionally per genre) into train/test parts and
    computes the keywords/cast/crew grade columns without training on test data."""
    def __init__(self, dataset) :
        # We initialize our preprocess with a dataset, ordered by year so the
        # later time-aware validation makes sense
        self.ds = sort_by_column(dataset, ['year'])
        self.data_genre = None
        self.train, self.test = [],[]
    def split_by_genre(self,n):
        # Keep only the n first genres; splits the data by genre
        self.data_genre , _ = get_data_from_db_by_gender_n(self.ds,n)
    def split_train_test(self) :
        # Split the data into train (80%) and test (20%) sets
        # NOTE(review): train_test_split shuffles by default; for this
        # time-ordered data shuffle=False may be intended — confirm
        if self.data_genre is None :
            train, test = train_test_split(self.ds, test_size=0.2)
            self.train, self.test = [train], [test]
        else :
            train_g, test_g = [],[]
            for df in self.data_genre :
                train, test = train_test_split(df, test_size=0.2)
                train_g.append(train)
                test_g.append(test)
            self.train, self.test = train_g, test_g
    def compute_grade(self,liste_all) :
        # Compute the keywords/cast/crew grades: the train grades are derived
        # from the train set only, while the test set reuses the grades
        # precomputed on the whole dataset (liste_all, same category order)
        liste_to_process = ['keywords','cast','crew']
        liste_train, liste_test = [],[]
        for index,data in enumerate(self.train) :
            for i,x in enumerate(liste_to_process) :
                if i == 0 :
                    # Copy once, before the first category is processed
                    train = data.copy()
                    test = self.test[index].copy()
                train = process_by_grade(train,x)
                # Align the precomputed whole-dataset grades on the test rows
                test[x+'_grade'] = liste_all[i][x+'_grade'][test.index]
                test.drop(x,axis=1, inplace = True)
            liste_train.append(train)
            liste_test.append(test)
        # Drop the rows whose keywords_grade came out as NaN
        liste_train, liste_test = get_clean_value(liste_train, liste_test)
        self.train, self.test = liste_train, liste_test
# Build the preprocessing pipeline from our previous helpers
pre_process = Preprocess(data)
# We split it into train and test
pre_process.split_train_test()
# Then we compute the grade for the 3 categories keywords, crew and cast
pre_process.compute_grade(liste_all_grade)
# Prints the quantiles of revenue; the 25%/75% values are reused below for discretization
print(pre_process.ds.revenue.describe())
def discretize_grade_100(liste_ds) :
    """Return copies of the given DataFrames with grade mapped to
    int(grade * 10), i.e. roughly 100 integer labels."""
    discretized = []
    for df in liste_ds :
        copy_df = df.copy()
        copy_df.grade = (df.grade * 10).apply(int)
        discretized.append(copy_df)
    return discretized
def labelize(x) :
    """Map a grade to an ordinal label between 1 and 19.

    Grades below 1 map to 1, grades >= 9.5 map to 19, and in between each
    half-point step adds one label — int(x * 2) reproduces the original
    if/elif chain exactly.  NaN falls through every comparison and yields
    None, as in the original.
    """
    if x < 1 :
        return 1
    if x >= 9.5 :
        return 19
    if 1 <= x < 9.5 :
        return int(x * 2)
def labelize_revenue(x):
    """Bucket a revenue into 3 classes using the dataset's 25%/75% quantiles
    (printed earlier): 1 = low (<= 19,283,110), 2 = middle (<= 155,633,200),
    3 = high."""
    if x <= 19283110 :
        return 1
    if x <= 155633200 :
        return 2
    return 3
def discretize_grade_20(liste_ds) :
    """Return copies of the DataFrames with grade replaced by the 19 ordinal
    labels produced by labelize."""
    return [df.assign(grade=df.grade.apply(labelize)) for df in liste_ds]
def discretize_revenue(liste_ds) :
    """Return copies of the DataFrames with revenue bucketed into the 3
    classes (low / medium / high) of labelize_revenue."""
    return [df.assign(revenue=df.revenue.apply(labelize_revenue)) for df in liste_ds]
from sklearn.preprocessing import StandardScaler
import seaborn as sns
# Creation of an other class in order to print the features, change the label, delete some features, ...
class Dataset :
    """Wraps a Preprocess object: feature removal, per-label train/test splits,
    discretization, normalization and correlation plots."""
    # Initialization with a Preprocess object (deep-copied so the original is untouched)
    def __init__(self, pre_process) :
        self.pre_process = copy.deepcopy(pre_process)
        # Data where grade is the label
        self.grade_set = []
        # Data where ratio is the label
        self.ratio_set = []
        # Data where revenue is the label
        self.revenue_set = []
        self.scaler = StandardScaler()
    def remove_feature(self,name_begin, name_end) :
        # Removes every feature column from name_begin until name_end
        # NOTE(review): the inplace drop on the train frame shifts ds.columns,
        # so ds.columns[index] on the following line selects DIFFERENT columns
        # for the test frame — the column list should be captured before dropping
        for n,ds in enumerate(self.pre_process.train) :
            i_min = ds.columns.get_loc(name_begin)
            i_max = ds.columns.get_loc(name_end)
            index = [i for i in range(i_min, i_max+1)]
            ds.drop(ds.columns[index], axis=1, inplace=True)
            self.pre_process.test[n].drop(ds.columns[index], axis=1, inplace=True)
    def print_features(self) :
        # Prints the features of the first train frame
        print(self.pre_process.train[0].columns)
    def set_sets(self):
        # Builds the three (X_train, X_test, y_train, y_test) tuples, one per
        # possible label (grade / ratio / revenue), dropping the other two labels
        for index,x in enumerate(self.pre_process.train) :
            # Returns X_train, X_test, y_train, y_test
            data_t_g , data_te_g = x.copy(), self.pre_process.test[index].copy()
            data_t_g.drop(['ratio', 'revenue'], axis = 1, inplace = True)
            data_te_g.drop(['ratio', 'revenue'], axis = 1, inplace = True)
            self.grade_set.append(split_data(data_t_g,data_te_g, 'grade'))
            data_t_r , data_te_r = x.copy(), self.pre_process.test[index].copy()
            data_t_r.drop(['grade', 'revenue'], axis = 1, inplace = True)
            data_te_r.drop(['grade', 'revenue'], axis = 1, inplace = True)
            self.ratio_set.append(split_data(data_t_r,data_te_r, 'ratio'))
            data_t_re , data_te_re = x.copy(), self.pre_process.test[index].copy()
            data_t_re.drop(['grade', 'ratio'], axis = 1, inplace = True)
            data_te_re.drop(['grade', 'ratio'], axis = 1, inplace = True)
            self.revenue_set.append(split_data(data_t_re,data_te_re, 'revenue'))
    def discretize(self, n) :
        # Discretizes the grade: n == 0 -> ~100 labels, otherwise 20 labels
        # NOTE(review): [0] always takes the FIRST frame's discretized grades
        # and assigns them to every frame — only correct when there is a
        # single train/test frame (i.e. no split by genre)
        if n == 0 :
            for index, x in enumerate(self.pre_process.train) :
                x.grade = discretize_grade_100(self.pre_process.train)[0].grade
                self.pre_process.test[index].grade = discretize_grade_100(self.pre_process.test)[0].grade
        else :
            for index, x in enumerate(self.pre_process.train) :
                x.grade = discretize_grade_20(self.pre_process.train)[0].grade
                self.pre_process.test[index].grade = discretize_grade_20(self.pre_process.test)[0].grade
    def discretize_revenue(self) :
        # Buckets revenue into 3 classes (same [0] caveat as discretize above)
        for index, x in enumerate(self.pre_process.train) :
            x.revenue = discretize_revenue(self.pre_process.train)[0].revenue
            self.pre_process.test[index].revenue = discretize_revenue(self.pre_process.test)[0].revenue
    def inter_scalar(self, n) :
        # n == 0 -> normalize the X matrices, otherwise denormalize them
        # NOTE(review): the scaler is fitted on train+test concatenated, which
        # leaks test statistics into normalization; denormalize also reuses
        # whatever scaler was fitted LAST — confirm this is intended
        liste_data = [self.grade_set ,self.ratio_set ,self.revenue_set]
        new_liste = []
        for data in liste_data :
            new_data = []
            for x_train, x_test, y_train, y_test in data :
                if n == 0 :
                    self.scaler.fit(np.concatenate((x_train,x_test)))
                    new_data.append([self.scaler.transform(x_train),\
                    self.scaler.transform(x_test), y_train,y_test])
                else :
                    new_data.append([self.scaler.inverse_transform(x_train),\
                    self.scaler.inverse_transform(x_test), y_train,y_test])
            new_liste.append(new_data)
        self.grade_set = new_liste[0]
        self.ratio_set = new_liste[1]
        self.revenue_set = new_liste[2]
    def normalize(self ):
        # Normalization of the data
        self.inter_scalar(0)
    def denormalize(self) :
        # Denormalization of the data
        self.inter_scalar(1)
    def get_grade_data(self,index) :
        # Returns the data where grade is the label
        return self.grade_set[index]
    def get_ratio_data(self, index) :
        # Returns the data where ratio is the label
        return self.ratio_set[index]
    def get_revenue_data(self, index) :
        # Returns the data where revenue is the label
        return self.revenue_set[index]
    def plot_correl_matrix(self,index) :
        # Plots the correlation matrix of train frame *index*
        # (with a genre split, index selects the genre; otherwise use 0)
        f, ax = plt.subplots(figsize=(9, 9))
        sns.heatmap(self.pre_process.train[index].corr(), vmax=1, square=True)
    def plot_par_correl_matrix(self, index) :
        # Same as above but keeps only the three label columns
        f, ax = plt.subplots(figsize=(25, 15))
        sns.heatmap(self.pre_process.train[index].corr()[['grade','revenue','ratio']], vmax=1, square=True)
# Creation of the final dataset wrapper
dataset_all = Dataset(pre_process)
dataset_all.set_sets()
dataset_all.normalize()
# Plots the correlation matrix
dataset_all.plot_correl_matrix(0)
# Plots only the relevant columns (the three labels)
dataset_all.plot_par_correl_matrix(0)
# Prints the relevant features that we'll keep for the moment
dataset_all.print_features()
# We split our dataset into training set and testing set
# We create it for each different label
# Here there's no split by genre, hence index 0
X_train_r, X_test_r , y_train_r , y_test_r = dataset_all.get_ratio_data(0)
X_train_re, X_test_re , y_train_re , y_test_re = dataset_all.get_revenue_data(0)
X_train_g, X_test_g , y_train_g , y_test_g = dataset_all.get_grade_data(0)
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# Class to do regression and to vizualise it easily
class Regressors :
    """Fit, cross-validate, score and plot a fixed family of regressors.

    Every regressor is identified by its integer index in
    ``self.regressors_name``; all the methods below take such an ``index``.
    Relies on the project helpers ``SKlearnHelper``, ``get_oof`` and
    ``get_score`` (imported from ``models``).
    """
    def __init__(self,X_train, y_train, X_test, y_test ) :
        # We'll mention in this class the regressor as an index to simplify:
        # all the functions below use "index" to refer to the regressor at
        # the corresponding position of this list.
        self.regressors_name = ['Linear Regression', 'KNeighborsRegressor' ,\
            'Stochastic Gradient Descent Regressor ' ,\
            'Descision Tree Regression',\
            'Random Forest Regressor',\
            'Support Vector Regressor ']
        # Training / testing data shared by every regressor.
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Parameter dict of each regressor (0 = not set yet)
        self.list_param = [0 for i in range (len(self.regressors_name)) ]
        # Score of each regressor on the testing set
        self.list_score = np.zeros((1,len(self.regressors_name)))[0]
        # Cross-validated score of each regressor on the training set
        self.liste_oof_score = np.zeros((1,len(self.regressors_name)))[0]
        # Regressor classes (0 = not set yet)
        # NOTE(review): the placeholder is 0 but set_param only guards with
        # "is not None", so calling set_param before set_classifier would
        # hand clf=0 to SKlearnHelper -- confirm callers always call
        # set_regressors first (they do in this file).
        self.classifiers = [0 for i in range (len(self.regressors_name) ) ]
        # Functions that return a regressor tuned with good parameters.
        # As the best parameters are computed on the training set, this
        # could lead to overfitting; we take the risk for the per-genre runs.
        self.functions = [None for i in range(len(self.regressors_name))]
    def get_oof_score(self,index) :
        """Return (and cache) the cross-validated score on the training set."""
        clfr = SKlearnHelper(clf= self.classifiers[index], params = self.list_param[index])
        oof_score = get_oof(clfr, self.X_train,self.y_train)
        self.liste_oof_score[index] = oof_score
        return oof_score
    def get_score(self, index) :
        """Return (and cache) the generalization score on the testing set."""
        clfr = SKlearnHelper(clf= self.classifiers[index], params = self.list_param[index])
        score = get_score(clfr,self.X_train,self.y_train,self.X_test,self.y_test)
        self.list_score[index] = score
        return score
    def set_classifier(self, index, clf) :
        """Set the regressor class used at position ``index``."""
        self.classifiers[index] = clf
    def set_param(self, index, param) :
        """Set the parameter dict of regressor ``index`` and rescore it
        immediately when its classifier is already set."""
        self.list_param[index] = param
        if (self.classifiers[index] is not None ):
            self.get_oof_score(index)
            self.get_score(index)
    def print_classifier_scores(self, index) :
        """Print both the cross-validation score (training set) and the
        generalization score (testing set), computing them if missing."""
        if (self.list_score[index] == 0 or self.liste_oof_score[index] == 0) :
            self.get_oof_score(index)
            self.get_score(index)
        print("Score cross validation with " + self.regressors_name[index], self.liste_oof_score[index])
        print("Score for generalization with "+ self.regressors_name[index], self.list_score[index])
    def set_function(self, index, func) :
        """Set the tuning function that returns regressor ``index`` with
        good parameters."""
        self.functions[index] = func
    def function(self,index) :
        """Run the tuning function of regressor ``index`` (if any), store the
        resulting parameters and classifier, and print the parameters."""
        if self.functions[index] is not None :
            param,clf = self.functions[index](self.X_train, self.y_train, True)
            self.set_param(index,param)
            self.set_classifier(index, clf)
            print("Parameters : ",param)
    def set_regressors(self, liste_regressor) :
        """Set all the regressor classes at once (ordered as regressors_name)."""
        for index, x in enumerate(liste_regressor) :
            self.set_classifier(index, x)
    def set_functions(self,liste_function):
        """Set all the tuning functions at once (ordered as regressors_name)."""
        for index, x in enumerate(liste_function) :
            self.set_function(index, x)
    def print_best_regressor(self) :
        """Return a summary line for the regressor with the best
        cross-validated score, quoting its testing-set score."""
        index = np.argmax(self.liste_oof_score)
        answer = "Best regressor : " + str(self.regressors_name[index]) +\
            " for a score : " + str(self.list_score[index])
        return answer
    def print_all_score(self) :
        """Print and bar-plot the testing-set score of every regressor."""
        print("Score for the different regressors ")
        score_regressors = {}
        score_regressors['Score'] = self.list_score
        score_class_df = pd.DataFrame(score_regressors, index =self.regressors_name)
        print(score_class_df)
        sns.set_color_codes("muted")
        score_class_df['Regressors'] = self.regressors_name
        sns.barplot(x='Score', y='Regressors', data=score_class_df, color="b")
        plt.xlabel('Score')
        plt.title('Regression Score')
        plt.show()
    def plot_prediction(self,name) :
        """Refit the regressor with the best cross-validated score and
        scatter-plot its predictions against the real testing values.

        x axis = instance number; y axis = ``name`` (the label plotted).
        """
        index = np.argmax(self.liste_oof_score)
        clf = self.classifiers[index]
        print("Regressor chosen :",self.regressors_name[index])
        params = self.list_param[index]
        classifier = SKlearnHelper(clf=clf, params = params)
        classifier.fit(self.X_train, self.y_train)
        y_predict = classifier.predict(self.X_test)
        f, ax = plt.subplots(figsize=(8, 8))
        plt.scatter([i for i in range(len(y_predict))] ,y_predict, color='b', label="Prediction",\
            marker='+')
        plt.scatter([i for i in range(len(y_predict))],self.y_test,color='r',label="Real Value",\
            marker='+')
        plt.xlabel("Number of instance")
        plt.ylabel(name)
        plt.title("Comparaison of the model with the real values")
        plt.legend()
        plt.show()
# Creation of our Regressors instance (ratio as the label)
regressors = Regressors(X_train_r, y_train_r, X_test_r, y_test_r,)
# List of all the regressors that we're going to use
liste_regressor = [LinearRegression,KNeighborsRegressor,SGDRegressor,DecisionTreeRegressor,\
    RandomForestRegressor,SVR]
# List of all the functions that return the best regressor
# (a 0 entry means "no tuning function" for that regressor)
liste_function = [0,get_knn, get_sgdr,get_dtr, get_rfr, 0 ]
# We set the previous information
regressors.set_regressors(liste_regressor)
regressors.set_functions(liste_function)
# Fixed parameters for the linear regression
param_lr = {
    'normalize':False
}
regressors.set_param(0, param_lr)
# Prints the score for cross validation and for generalization
regressors.print_classifier_scores(0)
# Fits the best parameters for the model
regressors.function(1)
# We choose the best parameters in order not to overfit
param_knn = {
    'algorithm':'auto',
    'n_neighbors':50
}
regressors.set_param(1, param_knn)
regressors.print_classifier_scores(1)
# Fits the best parameters for the model
regressors.function(2)
# Prints the results
regressors.print_classifier_scores(2)
# Fits the best parameters for the model
regressors.function(3)
# Prints the results
regressors.print_classifier_scores(3)
# Fits the best parameters for the model
regressors.function(4)
# Overrides the tuned parameters with conservative ones (less overfitting)
param_rfr = {
    'n_estimators': 7, 'max_features': 'auto', 'random_state': 1, 'max_depth': 1
}
regressors.set_param(4, param_rfr)
# Prints the results
regressors.print_classifier_scores(4)
# Use of a linear SVR
param_svr = {
    'kernel':'linear'
}
regressors.set_param(5,param_svr)
# Prints the results
regressors.print_classifier_scores(5)
# Prints all the results together
regressors.print_all_score()
# A simple plot of the predicted outputs and the real ones to see the differences
# We use the best regressor to plot the predicted outputs
plot_predict_real(regressors, 50, "Ratio", "Descision Tree Regression", [0,6],4)
# Creates a Regressors instance to do regression with the revenue
regressor_revenue = Regressors(X_train_re, y_train_re, X_test_re, y_test_re,)
# Sets the regressors and the functions that compute the best parameters for each regressor
regressor_revenue.set_regressors(liste_regressor)
regressor_revenue.set_functions(liste_function)
# Fixed parameters for the linear regression
param_lr = {
    'normalize':False
}
regressor_revenue.set_param(0, param_lr)
regressor_revenue.print_classifier_scores(0)
# Computes the best k parameter
regressor_revenue.function(1)
# We choose the best parameters in order to not overfit
# (reuses param_knn defined for the ratio run above)
regressor_revenue.set_param(1, param_knn)
regressor_revenue.print_classifier_scores(1)
# Computes the best loss and penalty functions
regressor_revenue.function(2)
regressor_revenue.print_classifier_scores(2)
# Computes the best number of max depth
regressor_revenue.function(3)
regressor_revenue.print_classifier_scores(3)
# Computes the best number of max depth and the number of estimators
regressor_revenue.function(4)
# Conservative parameters to limit overfitting
param_rfr_re = {
    'max_depth':6,
    'random_state':1,
    'max_features':'auto',
    'n_estimators':10
}
regressor_revenue.set_param(4,param_rfr_re)
regressor_revenue.print_classifier_scores(4)
# Use of a linear SVR
param_svr = {
    'kernel':'linear'
}
regressor_revenue.set_param(5,param_svr)
regressor_revenue.print_classifier_scores(5)
regressor_revenue.print_all_score()
# Plots the predicted versus the real values for the 50 first ones
plot_predict_real(regressor_revenue, 50, "Revenue","Linear Regression",\
    [0,10**9],5)
# Initialization (grade as the label)
regressor_grade = Regressors(X_train_g, y_train_g, X_test_g, y_test_g)
# Makes the function and the parameters initialisation
regressor_grade.set_regressors(liste_regressor)
regressor_grade.set_functions(liste_function)
# Fixed parameters for the linear regression
param_lr = {
    'normalize':False
}
regressor_grade.set_param(0, param_lr)
regressor_grade.print_classifier_scores(0)
regressor_grade.function(1)
regressor_grade.print_classifier_scores(1)
regressor_grade.function(2)
regressor_grade.print_classifier_scores(2)
regressor_grade.function(3)
regressor_grade.print_classifier_scores(3)
regressor_grade.function(4)
# Conservative parameters to limit overfitting
param_rfr_re = {
    'max_depth':5,
    'random_state':1,
    'max_features':'auto',
    'n_estimators':5
}
regressor_grade.set_param(4,param_rfr_re)
regressor_grade.print_classifier_scores(4)
# Use of a linear SVR
param_svr = {
    'kernel':'linear'
}
regressor_grade.set_param(5,param_svr)
regressor_grade.print_classifier_scores(5)
# Prints all the previous results
regressor_grade.print_all_score()
regressor_grade.plot_prediction("Grade")
# Creates a new Preprocess in order to create the split by genre
pre_process_genre = Preprocess(data)
# Splits by genre : we only keep the 8 first genres
pre_process_genre.split_by_genre(8)
# We split it into train and test
pre_process_genre.split_train_test()
# Computes the keywords_grade, cast_grade and crew_grade
# (liste_all_grade is presumably built earlier in the pipeline -- confirm)
pre_process_genre.compute_grade(liste_all_grade)
# Creates the dataset to remove some features and normalize
dataset_genre = Dataset(pre_process_genre)
# Creates the three labels : ratio, revenue and grade
dataset_genre.set_sets()
# Normalization
dataset_genre.normalize()
# List of all the genres used (same order as the genre subsets)
liste_genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Adventure', 'Romance','Crime',\
    'Science Fiction']
# List of the values of the correlation matrix by genre
liste_corr_genre = [dataset_genre.pre_process.train[i].corr() for i in range(len(liste_genres))]
# We plot the different correlation matrices
for index, x in enumerate(liste_corr_genre) :
    print("Genre : ",liste_genres[index])
    sns.heatmap(x, vmax=1, square=True)
    plt.show()
def compare_genre_regression(liste_genres, dataset_genre, data_getter='get_grade_data') :
    """Run every regression algorithm on each genre subset and report results.

    For each genre, builds a ``Regressors`` instance from the split
    returned by ``dataset_genre.<data_getter>(i)``, tunes every regressor
    (indexes 0 and 5 keep fixed parameters), prints all the scores and
    records the best regressor.

    Parameters:
        liste_genres : genre names, one per subset of ``dataset_genre``.
        dataset_genre : Dataset split by genre.
        data_getter : name of the Dataset accessor giving the label to
            learn.  Defaults to 'get_grade_data' (original behavior);
            'get_revenue_data' / 'get_ratio_data' reproduce the *_re and
            *_ratio variants of this function.

    Returns:
        (answer, liste_regressors) : the textual summary of the best
        regressor per genre, and the fitted ``Regressors`` instances.

    Relies on the module-level ``liste_regressor`` and ``liste_function``.
    """
    answer = []
    liste_regressors = []
    # Generalized accessor: the original hard-coded get_grade_data.
    get_data = getattr(dataset_genre, data_getter)
    for i in range(len(liste_genres)) :
        X_train, X_test, y_train, y_test = get_data(i)
        print(liste_genres[i])
        regressor = Regressors(X_train, y_train, X_test, y_test)
        regressor.set_regressors(liste_regressor)
        regressor.set_functions(liste_function)
        # Linear regression (0) and linear SVR (5) have no tuning
        # function; they keep these fixed parameters.
        regressor.set_param(0, {'normalize': False})
        regressor.set_param(5, {'kernel': 'linear'})
        for j in range(len(liste_regressor)) :
            if not(j == 0 or j == 5) :
                regressor.function(j)
            regressor.print_classifier_scores(j)
        print("For regressor : ", liste_genres[i])
        regressor.print_all_score()
        liste_regressors.append(regressor)
        answer.append(regressor.print_best_regressor())
    return answer , liste_regressors
# Runs the per-genre comparison with grade as the label
answer_grade, regressors_genre_grade = compare_genre_regression(liste_genres, dataset_genre)
print(answer_grade)
# It will tabulate the best regressor's score for each genre
def plot_graph_genre(regressors,liste_genres) :
    """Print, per genre, the best testing score and the regressor that got it.

    ``regressors`` holds one fitted Regressors instance per entry of
    ``liste_genres`` (same order).
    """
    best_scores = [np.max(reg.list_score) for reg in regressors]
    best_names = [reg.regressors_name[np.argmax(reg.list_score)]
                  for reg in regressors]
    summary = pd.DataFrame({'Score': best_scores,
                            'Regressor used': best_names},
                           index=liste_genres)
    print(summary)
# Table of the best regressor and its score for each genre (grade label)
plot_graph_genre(regressors_genre_grade,liste_genres)
def compare_genre_regression_re(liste_genres, dataset_genre) :
    """Compare every regression algorithm on each genre subset, with the
    revenue as the label.

    Returns the textual summary of the best regressor per genre and the
    fitted Regressors instances.  Relies on the module-level
    ``liste_regressor`` and ``liste_function``.
    """
    summaries = []
    fitted = []
    for idx, genre in enumerate(liste_genres) :
        X_train, X_test, y_train, y_test = dataset_genre.get_revenue_data(idx)
        print(genre)
        reg = Regressors(X_train, y_train, X_test, y_test)
        reg.set_regressors(liste_regressor)
        reg.set_functions(liste_function)
        # Fixed parameters for linear regression (0) and linear SVR (5);
        # neither has a tuning function.
        reg.set_param(0, {'normalize':False})
        reg.set_param(5, {'kernel':'linear'})
        for j in range(len(liste_regressor)) :
            if j != 0 and j != 5 :
                reg.function(j)
            reg.print_classifier_scores(j)
        print("For regressor : ", genre)
        reg.print_all_score()
        fitted.append(reg)
        summaries.append(reg.print_best_regressor())
    return summaries , fitted
# Runs the per-genre comparison with revenue as the label
answer_revenue, regressors_revenue_grade = compare_genre_regression_re(liste_genres, dataset_genre)
print(answer_revenue)
# Table of the best regressor and its score for each genre (revenue label)
plot_graph_genre(regressors_revenue_grade,liste_genres)
def compare_genre_regression_ratio(liste_genres, dataset_genre) :
    """Compare every regression algorithm on each genre subset, with the
    ratio as the label.

    Returns the textual summary of the best regressor per genre and the
    fitted Regressors instances.  Relies on the module-level
    ``liste_regressor`` and ``liste_function``.
    """
    summaries = []
    fitted = []
    for idx, genre in enumerate(liste_genres) :
        X_train, X_test, y_train, y_test = dataset_genre.get_ratio_data(idx)
        print(genre)
        reg = Regressors(X_train, y_train, X_test, y_test)
        reg.set_regressors(liste_regressor)
        reg.set_functions(liste_function)
        # Fixed parameters for linear regression (0) and linear SVR (5);
        # neither has a tuning function.
        reg.set_param(0, {'normalize':False})
        reg.set_param(5, {'kernel':'linear'})
        for j in range(len(liste_regressor)) :
            if j != 0 and j != 5 :
                reg.function(j)
            reg.print_classifier_scores(j)
        print("For regressor : ", genre)
        reg.print_all_score()
        fitted.append(reg)
        summaries.append(reg.print_best_regressor())
    return summaries , fitted
# Runs the per-genre comparison with ratio as the label
answer_ratio, regressors_ratio_genre = compare_genre_regression_ratio(liste_genres, dataset_genre)
plot_graph_genre(regressors_ratio_genre,liste_genres)
# Create the instance of the Dataset class in order to normalize, split and remove features
dataset_all_c = Dataset(pre_process)
# Discretizes the grade into 100 labels
dataset_all_c.discretize(0)
# Splits the set into training and testing sets
dataset_all_c.set_sets()
# Normalization of the input
dataset_all_c.normalize()
# Gets the train and testing sets
X_train_g_100, X_test_g_100 , y_train_g_100, y_test_g_100 = dataset_all_c.get_grade_data(0)
# Class to do Classification and to vizualise it easily
class Classifications :
    """Train, cross-validate, score and plot a fixed family of classifiers.

    Mirrors the Regressors class; each classifier is identified by its
    integer index in ``self.classificators_name``.  Relies on the project
    helpers ``SKlearnHelper``, ``get_oof_c`` and ``get_score_c``.
    """
    def __init__(self,X_train, y_train, X_test, y_test ) :
        # Name of the classifiers used; "index" in the methods below
        # refers to a position in this list.
        self.classificators_name = ['LDA','LogisticRegression','Perceptron',\
            'SVC','Decision Trees','RandomForestClassifier','AdaBoostClassifier']
        # Input training set
        self.X_train = X_train
        # Output training set
        self.y_train = y_train
        # Input testing set
        self.X_test = X_test
        # Output testing set
        self.y_test = y_test
        # Parameter dict of each classifier (0 = not set yet)
        self.list_param = [0 for i in range (len(self.classificators_name)) ]
        # Accuracy of each classifier on the testing set
        self.list_score = np.zeros((1,len(self.classificators_name)))[0]
        # Cross-validated accuracy of each classifier on the training set
        self.liste_oof_score = np.zeros((1,len(self.classificators_name)))[0]
        # Classifier classes (0 = not set yet)
        self.classifiers = [0 for i in range (len(self.classificators_name)) ]
        # Functions used to get the best parameters for each classifier
        self.functions = [None for i in range(len(self.classificators_name))]
    def get_oof_score(self,index) :
        """Return (and cache) the cross-validated accuracy on the training
        set of the classifier at position ``index``."""
        clfr = SKlearnHelper(clf= self.classifiers[index], params = self.list_param[index])
        oof_score = get_oof_c(clfr, self.X_train,self.y_train)
        self.liste_oof_score[index] = oof_score
        return oof_score
    def get_score(self, index) :
        """Return (and cache) the accuracy on the testing set."""
        clfr = SKlearnHelper(clf= self.classifiers[index], params = self.list_param[index])
        score = get_score_c(clfr,self.X_train,self.y_train,self.X_test,self.y_test)
        self.list_score[index] = score
        return score
    def set_classifier(self, index, clf) :
        """Set the classifier class used at position ``index``."""
        self.classifiers[index] = clf
    def set_param(self, index, param) :
        """Set the parameter dict of classifier ``index`` and rescore it
        immediately when its classifier is already set."""
        self.list_param[index] = param
        if (self.classifiers[index] is not None ):
            self.get_oof_score(index)
            self.get_score(index)
    def print_classifier_scores(self, index) :
        """Print the accuracy on the training (cross-validated) and testing
        sets, computing them if missing."""
        if (self.list_score[index] == 0 or self.liste_oof_score[index] == 0) :
            self.get_oof_score(index)
            self.get_score(index)
        print("Score cross validation with " + self.classificators_name[index], self.liste_oof_score[index])
        print("Score for generalization with "+ self.classificators_name[index], self.list_score[index])
    def set_function(self, index, func) :
        """Set the tuning function for the classifier at ``index``."""
        self.functions[index] = func
    def function(self,index) :
        """Run the tuning function of classifier ``index`` (if any), store
        the resulting parameters and classifier, and print the parameters."""
        if self.functions[index] is not None :
            param,clf = self.functions[index](self.X_train, self.y_train, True)
            self.set_param(index,param)
            self.set_classifier(index, clf)
            print("Parameters : ",param)
    def set_regressors(self, liste_regressor) :
        """Set all the classifier classes at once (name kept for symmetry
        with the Regressors class)."""
        for index, x in enumerate(liste_regressor) :
            self.set_classifier(index, x)
    def set_functions(self,liste_function):
        """Set all the tuning functions at once."""
        for index, x in enumerate(liste_function) :
            self.set_function(index, x)
    def print_all_score(self) :
        """Print and bar-plot the testing-set accuracy of every classifier.

        NOTE(review): the plot title 'Regression Score' was copy-pasted
        from the Regressors class.
        """
        print("Score for the different classifiers ")
        score_regressors = {}
        score_regressors['Score'] = self.list_score
        score_class_df = pd.DataFrame(score_regressors, index =self.classificators_name)
        print(score_class_df)
        sns.set_color_codes("muted")
        score_class_df['Classifiers'] = self.classificators_name
        sns.barplot(x='Score', y='Classifiers', data=score_class_df, color="b")
        plt.xlabel('Score')
        plt.title('Regression Score')
        plt.show()
    def plot_prediction(self,name) :
        """Refit the classifier with the best *testing* accuracy (unlike
        Regressors, which uses the cross-validated score) and scatter-plot
        its predictions against the real testing values."""
        index = np.argmax(self.list_score)
        clf = self.classifiers[index]
        print("Regressor chosen :",self.classificators_name[index])
        params = self.list_param[index]
        classifier = SKlearnHelper(clf=clf, params = params)
        classifier.fit(self.X_train, self.y_train)
        y_predict = classifier.predict(self.X_test)
        f, ax = plt.subplots(figsize=(8, 8))
        plt.scatter([i for i in range(len(y_predict))] ,y_predict, color='b', label="Prediction",\
            marker='+')
        plt.scatter([i for i in range(len(y_predict))],self.y_test,color='r',label="Real Value",\
            marker='+')
        plt.xlabel("Number of instance")
        plt.ylabel(name)
        plt.title("Comparaison of the model with the real values")
        plt.legend()
        plt.show()
    def print_best_classifier(self) :
        """Return a summary line for the classifier with the best
        cross-validated accuracy, quoting its testing-set accuracy.

        NOTE(review): the message says 'Best regressor' -- copy-pasted
        wording from the Regressors class.
        """
        index = np.argmax(self.liste_oof_score)
        answer = "Best regressor : " + str(self.classificators_name[index]) +\
            " for a score : " + str(self.list_score[index])
        return answer
# Classifications instance that will be used to train our model on our data
# We take the grade with 100 labels as output
classifier_100 = Classifications(X_train_g_100, y_train_g_100, X_test_g_100 , y_test_g_100)
# Declaration of the tuning functions and classifiers that we're going to use
liste_function_classifier = [get_lda , get_lreg, get_perc , get_svm,get_dtc ,\
    get_rfc, get_ada]
liste_classifiers = [LinearDiscriminantAnalysis, LogisticRegression,Perceptron,\
    SVC, DecisionTreeClassifier, RandomForestClassifier, AdaBoostClassifier]
# Sets the functions and classifiers on our instance
classifier_100.set_functions(liste_function_classifier)
classifier_100.set_regressors(liste_classifiers)
# Fits the data with the best parameters, then returns the score for LDA
classifier_100.function(0)
classifier_100.print_classifier_scores(0)
# Fits the data with the best parameters, then returns the score for Logistic Regression
classifier_100.function(1)
classifier_100.print_classifier_scores(1)
# Fits the data with the best parameters, then returns the score for Perceptron
classifier_100.function(2)
classifier_100.print_classifier_scores(2)
# Fits the data with the best parameters, then returns the score for SVC
classifier_100.function(3)
classifier_100.print_classifier_scores(3)
# Fits the data with the best parameters, then returns the score for Decision Trees
classifier_100.function(4)
classifier_100.print_classifier_scores(4)
# Fits the data with the best parameters, then returns the score for Random Forest Classifier
classifier_100.function(5)
classifier_100.print_classifier_scores(5)
# Fits the data with the best parameters, then returns the score for AdaBoost Classifier
classifier_100.function(6)
classifier_100.print_classifier_scores(6)
# Prints the results and summarizes them
classifier_100.print_all_score()
# Plots the 50 first instances of the testing set with the real output and the predicted one
plot_predict_real(classifier_100, 50, "Grade discretized with 100 labels", \
    "Random Forest Classifier", [-1,101],6)
# Creation of Dataset instance
dataset_all_c_20 = Dataset(pre_process)
# Discretizes the grade as 20 labels
dataset_all_c_20.discretize(1)
# Splits it into train and test sets
dataset_all_c_20.set_sets()
# Normalization
dataset_all_c_20.normalize()
# Splits it into training and testing sets
X_train_g_20, X_test_g_20 , y_train_g_20, y_test_g_20 = dataset_all_c_20.get_grade_data(0)
# Creation of Classifications instance
classifier_20 = Classifications(X_train_g_20, y_train_g_20, X_test_g_20 , y_test_g_20)
# Sets the functions used to get the best parameters
classifier_20.set_functions(liste_function_classifier)
# Sets the classifiers that we're going to use
classifier_20.set_regressors(liste_classifiers)
# Fits the data with the best parameters, then returns the score for LDA
classifier_20.function(0)
classifier_20.print_classifier_scores(0)
# Fits the data with the best parameters, then returns the score for Logistic Regression
classifier_20.function(1)
classifier_20.print_classifier_scores(1)
# Fits the data with the best parameters, then returns the score for Perceptron
classifier_20.function(2)
classifier_20.print_classifier_scores(2)
# Fits the data with the best parameters, then returns the score for SVC
classifier_20.function(3)
classifier_20.print_classifier_scores(3)
# Fits the data with the best parameters, then returns the score for Decision Trees
classifier_20.function(4)
classifier_20.print_classifier_scores(4)
# Fits the data with the best parameters, then returns the score for Random Forest Classifier
classifier_20.function(5)
# Prints the results
classifier_20.print_classifier_scores(5)
# Fits the data with the best parameters, then returns the score for AdaBoost Classifier
classifier_20.function(6)
classifier_20.print_classifier_scores(6)
# Prints the results and summarizes them
classifier_20.print_all_score()
# Plots the 50 first instances of the testing set with the real output and the predicted one
# NOTE(review): the "Linear Regression" label and the -2 argument look
# inherited from the regression plots -- confirm against plot_predict_real.
plot_predict_real(classifier_20, 50, "Grade discretized", "Linear Regression", [0,20],-2)
# Creation of Dataset instance
dataset_all_r = Dataset(pre_process)
# Discretizes the revenue as 3 labels : low, medium and high revenue
dataset_all_r.discretize_revenue()
# Splits to train and test sets
dataset_all_r.set_sets()
# Normalization of the inputs
dataset_all_r.normalize()
# Creation of our training and testing subsets
X_train_c_r, X_test_c_r , y_train_c_r, y_test_c_r = dataset_all_r.get_revenue_data(0)
# Creation of a Classifications instance
classifier_r = Classifications(X_train_c_r, y_train_c_r, X_test_c_r , y_test_c_r)
# Sets the functions to get the best parameters
classifier_r.set_functions(liste_function_classifier)
# Sets the classifiers
classifier_r.set_regressors(liste_classifiers)
# Loops through all the classifiers
for i in range(len(classifier_r.classificators_name)) :
    # Gets the best parameters through a cross validation process
    # to get the scores with the current parameters
    classifier_r.function(i)
    # Prints the accuracy of the classifiers
    classifier_r.print_classifier_scores(i)
# Plots the results of the previous process with all the accuracy of all the classifiers
classifier_r.print_all_score()
# Plots the 50 first instances of our testing set with the real and the predicted outputs
plot_predict_real(classifier_r, 50, "Revenue discretized by 3", "Random Forest Classifier",\
    [0.8,4],6)
# Creation of an instance of Preprocess to deal with the dataset
pre_process_genre = Preprocess(data)
# Splits the dataset into 8 genre subsets
pre_process_genre.split_by_genre(8)
# Splits into train and test
pre_process_genre.split_train_test()
# Computes the keyword_grade, crew_grade and cast_grade
pre_process_genre.compute_grade(liste_all_grade)
# Creation of a Dataset instance
dataset_genre = Dataset(pre_process_genre)
# Discretization into 100 labels
dataset_genre.discretize(0)
# Splits into train and test subsets
dataset_genre.set_sets()
# Normalization of the inputs
dataset_genre.normalize()
# List with the names of the 8 genres that we'll use (subset order)
liste_genres = ['Drama', 'Comedy', 'Thriller', 'Action', 'Adventure', 'Romance','Crime',\
    'Science Fiction']
# List of the correlation matrices by genre
liste_corr_genre = [dataset_genre.pre_process.train[i].corr() for i in range(len(liste_genres))]
# Heatmap of each genre's correlation matrix
for index, x in enumerate(liste_corr_genre) :
    print(liste_genres[index])
    sns.heatmap(x, vmax=1, square=True)
    plt.show()
# Runs the whole classifier family on every genre subset and reports, for
# each genre, the accuracies and the best classifier (grade as the label).
def compare_classification_genre(liste_genres, dataset_genre) :
    """Compare every classifier on each genre subset, grade as the label.

    Returns the textual summary of the best classifier per genre and the
    fitted Classifications instances.
    """
    tuning_funcs = [get_lda, get_lreg, get_perc, get_svm, get_dtc,
                    get_rfc, get_ada]
    clf_classes = [LinearDiscriminantAnalysis, LogisticRegression, Perceptron,
                   SVC, DecisionTreeClassifier, RandomForestClassifier,
                   AdaBoostClassifier]
    summaries = []
    fitted = []
    for idx, genre in enumerate(liste_genres) :
        X_train, X_test, y_train, y_test = dataset_genre.get_grade_data(idx)
        print(genre)
        clf = Classifications(X_train, y_train, X_test, y_test)
        clf.set_regressors(clf_classes)
        clf.set_functions(tuning_funcs)
        # Index 0 (LDA) gets no tuning/scoring pass here; its scores still
        # appear (as zeros) in print_all_score.
        for j in range(1, len(tuning_funcs)) :
            clf.function(j)
            clf.print_classifier_scores(j)
        print("For Classifier : ", genre)
        clf.print_all_score()
        fitted.append(clf)
        summaries.append(clf.print_best_classifier())
    return summaries , fitted
# Gets the results for all the genres (grade discretized into 100 labels)
answer_c_100 , liste_classifiers_g_100 = compare_classification_genre(liste_genres, dataset_genre)
# Tabulates, for each genre, the best accuracy and the classifier that
# reached it.
def plot_graph_genre_c(classifiers,liste_genres) :
    """Print a per-genre table of the best test accuracy and the winning classifier."""
    best_accuracy = []
    best_classifier = []
    for i, _genre in enumerate(liste_genres) :
        scores = classifiers[i].list_score
        winner = np.argmax(scores)
        best_accuracy.append(np.max(scores))
        best_classifier.append(classifiers[i].classificators_name[winner])
    summary = pd.DataFrame({'Accuracy': best_accuracy,
                            'Classifier used': best_classifier},
                           index=liste_genres)
    print(summary)
# Prints the results for each genre with 100 labels as grade
plot_graph_genre_c(liste_classifiers_g_100,liste_genres)
# Creation of a Dataset instance
dataset_genre_20 = Dataset(pre_process_genre)
# Discretization into 20 labels
dataset_genre_20.discretize(1)
# Splits into train and test subsets
dataset_genre_20.set_sets()
# Normalization of the inputs
dataset_genre_20.normalize()
# Gets the results for all the genres
answer_c_20 , liste_classifiers_g_20 = compare_classification_genre(liste_genres, dataset_genre_20)
# Prints the results for each genre with 20 labels as grade
print(len(liste_classifiers_g_20))
plot_graph_genre_c(liste_classifiers_g_20,liste_genres)
# Creation of a Dataset instance
dataset_genre_r = Dataset(pre_process_genre)
# Discretization of the revenue into 3 labels
dataset_genre_r.discretize_revenue()
# Splits into train and test subsets
dataset_genre_r.set_sets()
# Normalization of the inputs
dataset_genre_r.normalize()
# Same study as compare_classification_genre but with the discretized
# revenue as the label.
def compare_revenue_genre(liste_genres, dataset_genre) :
    """Compare every classifier with the discretized revenue as the label.

    Only the first genre is processed: for the other genres some subsets
    end up with a single output class.  Returns the summary lines and the
    fitted Classifications instances.
    """
    tuning_funcs = [get_lda, get_lreg, get_perc, get_svm, get_dtc,
                    get_rfc, get_ada]
    clf_classes = [LinearDiscriminantAnalysis, LogisticRegression, Perceptron,
                   SVC, DecisionTreeClassifier, RandomForestClassifier,
                   AdaBoostClassifier]
    summaries = []
    fitted = []
    # Deliberately limited to the first genre (see docstring).
    for i in range(1) :
        print(liste_genres[i])
        X_train, X_test, y_train, y_test = dataset_genre.get_revenue_data(i)
        clf = Classifications(X_train, y_train, X_test, y_test)
        clf.set_regressors(clf_classes)
        clf.set_functions(tuning_funcs)
        # Index 0 (LDA) gets no tuning/scoring pass here.
        for j in range(1, len(tuning_funcs)) :
            clf.function(j)
            clf.print_classifier_scores(j)
        print("For Classifier : ", liste_genres[i])
        clf.print_all_score()
        fitted.append(clf)
        summaries.append(clf.print_best_classifier())
    return summaries , fitted
# Gets the results (revenue label); only the first genre is processed
answer_r , liste_classifiers_r = compare_revenue_genre(liste_genres, dataset_genre_r)
# Gets the linear-regression coefficients of the grade model
# NOTE(review): classifiers[0]() instantiates the regressor with its
# default parameters, ignoring the tuned list_param[0] -- confirm intended.
coefs = regressor_grade.classifiers[0]().fit(regressor_grade.X_train, regressor_grade.y_train).coef_
# Gets the name of the features
name_features = pre_process.train[0].columns
# Indexes of the features with the largest coefficients
# (1.5e11 is an empirically chosen threshold)
index_max = np.where( coefs >= 1.5e11)[0]
# Notebook-style display of the coefficient array
coefs
# Names of the selected features
# NOTE(review): assumes data.columns minus the three labels matches the
# order of the model's input features -- TODO confirm against the
# preprocessing pipeline.
new_column = np.array([x for x in data.columns if (x!='revenue' and x!='ratio' and x!='grade')])[index_max]
# Plots the largest coefficients only
plt.xticks(rotation=90, fontsize = 15)
plt.ylabel("Coefficient value on linear regression")
plt.xlabel("Features")
plt.bar(new_column, coefs[index_max], align = 'center', color='b')
plt.title("Coefficient of linear regression depending on the features")
plt.show()
# Plots every coefficient
new_column = np.array([x for x in data.columns if (x!='revenue' and x!='ratio' and x!='grade')])
plt.xticks(rotation=90, fontsize = 10)
plt.ylabel("Coefficient value on linear regression")
plt.xlabel("Features")
plt.bar(new_column, coefs, align = 'center', color='b')
plt.title("Coefficient of linear regression depending on the features")
plt.show()
As a conclusion, it seems that for the linear regression, the larger a feature's coefficient is, the more important that feature is. In that respect, a movie that is produced in France or whose language is English is more likely to be a success.